## *************************************
##        A2 Summary Statistics       
##         Hao Zhang & Ye Zhang
##             2025/7/24
## *************************************

library(readstata13)
library(stargazer)
library(tidyverse)
library(readxl)

## Read in the city panel and keep the rows with 2010 < year < 2016.
## setwd() is kept on purpose: the relative file names used further down
## (city codes, mayor data, the second city-panel read) resolve against it.
## NOTE(review): in the visible script the `city` object built in this section
## is rebuilt from the raw file in the later "post period" section before
## anything reads it -- confirm whether this section is still needed.
setwd("../raw data")
city <- read_excel("city panel 2010-2019.xlsx") %>%
  select(
    city, year, gdp_growth, gdp, gdppc, fiscal_inc, fiscal_exp,
    FDI, hosp, unemp, pop, tax_absc_firm
  ) %>%
  filter(year > 2010 & year < 2016) %>%
  as.data.frame()

## Read in citycode for matching. Keep only the first row per city name so
## the join cannot duplicate city-year observations, then drop the name column
## and parse every remaining character column into numbers.
citycode <- read_excel("citycode.xlsx") %>%
  distinct(city, .keep_all = TRUE)
city <- left_join(city, citycode, by = "city") %>%
  select(-city) %>%
  mutate(across(where(is.character), ~parse_number(.)))

## GDP-growth pressure: province-year mean growth minus the city's own growth.
## city4 is the code with its last two digits removed; province is the code
## with its last four digits removed.
## Rows with city4 in {1100, 1200, 3100, 5102} or with city4 ending in "01"
## are dropped BEFORE the province-year mean is taken.
## NOTE(review): presumably these are centrally administered cities and
## provincial capitals -- confirm against the code book.
city <- city %>%
  mutate(
    city4 = as.integer(citycode / 100),
    province = as.integer(citycode / 10000)
  ) %>%
  filter(city4 != 1100, city4 != 1200, city4 != 3100, city4 != 5102) %>%
  filter(substr(as.character(city4), 3, 4) != "01") %>%
  group_by(province, year) %>%
  mutate(
    gdpgrowth_mean = mean(gdp_growth, na.rm = TRUE),
    gdp_pressure = gdpgrowth_mean - gdp_growth
  ) %>%
  # Drop the grouping so later verbs do not silently operate per province-year.
  ungroup()

## CFPS data: summary table for the unemployment, medical and pension columns,
## restricted to rows with year <= 2015.
## NOTE(review): load() is expected to create the object `data` used below --
## confirm against the contents of the .RData file.
load("../raw data/cfps 2012-2017.RData")
data1 <- data %>%
  filter(year <= 2015) %>%
  select(unemployment, medical, pension)
stargazer(
  data1,
  omit.summary.stat = c("median", "p25", "p75")
) # With adjustment

################################################
## Mayor data: summary statistics for 2011-2015
# Read the Stata file, keep the sample years and the reported variables.
mayor <- read.dta13("city mayor 1990-2015.dta") %>%
  filter(year >= 2011 & year <= 2015) %>%
  select(begin, seq, ethnicity, educ, gender)

# Summary table for the selected mayor variables.
stargazer(mayor, omit.summary.stat = c("median", "p25", "p75"))

# Each line below prints the number of non-empty entries of a column.
# The distinct-value counts in the trailing comments are not computed here;
# they record the result of a separate manual inspection.
with(mayor, length(ethnicity[ethnicity != ""])) # manual check to find 9 distinct values
with(mayor, length(educ[educ != ""])) # manual check to find 5 distinct values
with(mayor, length(gender[gender != ""])) # manual check to find 2 distinct values

## Read in city data for post period (rows with 2010 < year < 2016).
## File names are relative to the working directory set earlier in the script.
city <- read_excel("city panel 2010-2019.xlsx") %>%
  select(
    city, year, gdp_growth, gdp, gdppc, fiscal_inc, fiscal_exp,
    FDI, hosp, unemp, pop, tax_absc_firm
  ) %>%
  filter(year > 2010 & year < 2016) %>%
  as.data.frame()

## Read in citycode for matching.
## Keep only the first row per city name: a repeated name in the lookup would
## otherwise duplicate city-year rows in the join and inflate every statistic
## reported below. The name column is dropped after the join so it is not
## coerced to numeric (which would only yield NAs plus coercion warnings).
citycode <- read_excel("citycode.xlsx") %>%
  distinct(city, .keep_all = TRUE)
city <- left_join(city, citycode, by = "city") %>%
  select(-city) %>%
  mutate(across(where(is.character), as.numeric))

## GDP-growth pressure: province-year mean growth minus the city's own growth.
## city4 is the code with its last two digits removed; province is the code
## with its last four digits removed.
## NOTE(review): the province-year mean is computed BEFORE the +/-50 growth
## filter and before the four city4 exclusions, so those rows still contribute
## to gdpgrowth_mean -- confirm this ordering is intended.
city <- city %>%
  mutate(
    city4 = as.integer(citycode / 100),
    province = as.integer(citycode / 10000)
  ) %>%
  # Drop codes whose 3rd-4th digits are "01".
  filter(substr(as.character(city4), 3, 4) != "01") %>%
  group_by(province, year) %>%
  mutate(gdpgrowth_mean = mean(gdp_growth, na.rm = TRUE)) %>%
  # Keep growth rates within [-50, 50]; rows with missing growth are dropped.
  filter(gdp_growth <= 50 & gdp_growth >= -50) %>%
  mutate(gdp_pressure = gdpgrowth_mean - gdp_growth) %>%
  ungroup() %>%
  filter(city4 != 1100, city4 != 1200, city4 != 3100, city4 != 5102) %>%
  # Fiscal deficit = expenditure minus revenue.
  mutate(deficit = fiscal_exp - fiscal_inc)

# Inverse hyperbolic sine transform: ihs(x) = log(x + sqrt(x^2 + 1)).
#
# Implemented with base R's asinh() rather than the textbook formula. The
# explicit formula cancels to log(0) = -Inf for large negative x and overflows
# to Inf once x^2 exceeds the double range; asinh() has neither problem.
#
# x:       numeric vector (NA values propagate).
# returns: numeric vector of the same length as x.
ihs <- function(x) {
  asinh(x)
}

## Build the reported variables and print the summary table
city <- city %>%
  mutate(
    # Percentage-point quantities rescaled by 1/100.
    gdp_pressure = gdp_pressure / 100,
    gdpgrowth = gdp_growth / 100,
    # Log GDP per capita is recomputed from gdp and pop.
    log_gdppc = log(gdp / pop),
    # Despite the "log_" prefix this is the inverse hyperbolic sine of
    # deficit / GDP (the ratio can be zero or negative, where log() fails).
    log_deficit = ihs(deficit / gdp),
    capacity = ihs(tax_absc_firm / gdp),
    log_FDI = log(FDI),
    # Evaluated left to right, i.e. unemp / (pop * 10000).
    # NOTE(review): presumably pop is recorded in units of 10,000 persons --
    # confirm against the data documentation.
    unemppc = unemp / pop / 10000,
    hosppc = hosp / pop / 10000
  ) %>%
  select(
    gdp_pressure, gdpgrowth, log_gdppc, log_deficit,
    capacity, log_FDI, unemppc, hosppc
  ) %>%
  as.data.frame()

stargazer(city, omit.summary.stat = c("median", "p25", "p75"))
